!pip install wordcloud
!pip install nltk
!pip install yfinance
!pip install plotly
!pip install pandas_datareader
!pip install tabulate
Requirement already satisfied: wordcloud in /opt/conda/lib/python3.9/site-packages (1.8.1) Requirement already satisfied: matplotlib in /opt/conda/lib/python3.9/site-packages (from wordcloud) (3.4.3) Requirement already satisfied: pillow in /opt/conda/lib/python3.9/site-packages (from wordcloud) (8.3.2) Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.9/site-packages (from wordcloud) (1.20.3) Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.4.7) Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib->wordcloud) (0.10.0) Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib->wordcloud) (1.3.2) Requirement already satisfied: six in /opt/conda/lib/python3.9/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.16.0) Requirement already satisfied: nltk in /opt/conda/lib/python3.9/site-packages (3.6.7) Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from nltk) (4.62.3) Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.9/site-packages (from nltk) (2021.11.10) Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk) (1.1.0) Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk) (8.0.3) Requirement already satisfied: yfinance in /opt/conda/lib/python3.9/site-packages (0.1.68) Requirement already satisfied: lxml>=4.5.1 in /opt/conda/lib/python3.9/site-packages (from yfinance) (4.7.1) Requirement already satisfied: pandas>=0.24 in /opt/conda/lib/python3.9/site-packages (from yfinance) (1.3.4) Requirement already satisfied: multitasking>=0.0.7 in /opt/conda/lib/python3.9/site-packages (from yfinance) (0.0.10) Requirement 
already satisfied: numpy>=1.15 in /opt/conda/lib/python3.9/site-packages (from yfinance) (1.20.3) Requirement already satisfied: requests>=2.26 in /opt/conda/lib/python3.9/site-packages (from yfinance) (2.26.0) Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.9/site-packages (from pandas>=0.24->yfinance) (2.8.2) Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.9/site-packages (from pandas>=0.24->yfinance) (2021.3) Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.9/site-packages (from requests>=2.26->yfinance) (2.0.0) Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests>=2.26->yfinance) (3.1) Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests>=2.26->yfinance) (2021.10.8) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests>=2.26->yfinance) (1.26.7) Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.24->yfinance) (1.16.0) Requirement already satisfied: plotly in /opt/conda/lib/python3.9/site-packages (5.5.0) Requirement already satisfied: six in /opt/conda/lib/python3.9/site-packages (from plotly) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly) (8.0.1) Requirement already satisfied: pandas_datareader in /opt/conda/lib/python3.9/site-packages (0.10.0) Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.9/site-packages (from pandas_datareader) (2.26.0) Requirement already satisfied: pandas>=0.23 in /opt/conda/lib/python3.9/site-packages (from pandas_datareader) (1.3.4) Requirement already satisfied: lxml in /opt/conda/lib/python3.9/site-packages (from pandas_datareader) (4.7.1) Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.9/site-packages 
(from pandas>=0.23->pandas_datareader) (2.8.2) Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.9/site-packages (from pandas>=0.23->pandas_datareader) (2021.3) Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.9/site-packages (from pandas>=0.23->pandas_datareader) (1.20.3) Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->pandas_datareader) (2021.10.8) Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->pandas_datareader) (2.0.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->pandas_datareader) (1.26.7) Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->pandas_datareader) (3.1) Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.23->pandas_datareader) (1.16.0) Collecting tabulate Downloading tabulate-0.8.9-py3-none-any.whl (25 kB) Installing collected packages: tabulate Successfully installed tabulate-0.8.9
# --- Scraping / data handling ---
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Word-cloud section
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import urllib
# NLP section
import nltk
# Fetch the NLTK resources used below (tokenizer, stop words, VADER lexicon).
nltk.download('punkt')
words = nltk.download('stopwords')  # NOTE(review): `words` is the download status flag, not word data — appears unused
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from tabulate import tabulate
# Data-visualisation section
import math
import seaborn as sns
import yfinance as yf
import datetime
import plotly.graph_objects as go
import plotly.express as px
# Modelling / optimisation section
from numpy import matrix, array, zeros, empty, sqrt, ones, dot, append, mean, cov, transpose, linspace
from numpy.linalg import inv, pinv
import pandas_datareader as pdr
# NOTE(review): this import shadows the `datetime` module imported above with
# the datetime class — confirm later cells (e.g. load_data) see the intended binding.
from datetime import datetime
from pylab import *
import scipy.optimize
import random
import warnings
# Silence all library warnings globally (this also hides deprecation notices).
warnings.filterwarnings("ignore")
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to /home/jovyan/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package vader_lexicon to [nltk_data] /home/jovyan/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
L'objectif de cette partie est de récupérer tous les titres des news associés aux différentes entreprises capitalisées dans le S&P 500, qui correspond au premier indice boursier mondial et représente plus de 70 % de la capitalisation boursière de New York. Il est composé des 500 plus grandes entreprises. Pour cela, on effectue du web scraping sur le site Investing.com, plateforme financière et site web d'information qui répertorie les dernières news associées à chaque entreprise. L'objectif est d'obtenir un dictionnaire contenant les entreprises ainsi que les titres des 30 dernières news associées.
def get_company():
    """
    Scrape Investing.com for the list of S&P 500 company page slugs.

    Returns:
        list[str]: company identifiers as they appear in Investing.com URLs.
    """
    url = 'https://fr.investing.com/indices/investing.com-us-500-components'
    # Spoof a browser User-Agent so the site serves the page to our script
    # instead of blocking it as a bot.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    page = BeautifulSoup(response.content, 'html.parser')
    cells = page.findAll("td", {"class": "bold left noWrap elp plusIconTd"})
    # Each cell links to /equities/<slug>; keep the slug segment of the href.
    return [cell.find("a")['href'].split('/')[2] for cell in cells]
# Fetch the list of S&P 500 company slugs once at startup.
liste_entreprise = get_company()
#liste_entreprise
def get_news(company, nb_page=1):
    """
    Fetch the news headlines listed for a company on Investing.com.

    Args:
        company (str): company slug as it appears in Investing.com URLs.
        nb_page (int): news page number to fetch (one page = ~10 titles),
            defaults to 1.

    Returns:
        list[str]: headline titles, with the leading sponsored entries removed.
    """
    url = 'https://www.investing.com/equities/' + company + '-news/' + str(nb_page)
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.content, 'html.parser')
    article_body = soup.findAll("a", {"class": "title"})
    # Skip anchors without a title attribute: a.get("title") returns None for
    # them and downstream ' '.join(...) calls would crash on None entries.
    titles = [a.get("title") for a in article_body if a.get("title") is not None]
    return titles[6:]  # the first 6 entries are ads (on every page...)
# Quick sanity check: page 3 of Tesla's news feed.
print(get_news("tesla-motors",3))
['European Markets Jump to Start 2022: Autos, Airplanes, Travel Lead the Way', 'Pre-Market Movers: Tesla jumps, Ford, Boeing rise; Ross Stores dips', "EV Delivery Numbers, a 2021 Encore, Oil Jolt - What's Moving Markets", 'Tesla Pops Premarket, Chinese EV Firms Also Gain on Q4 Sales Momentum', 'CoinQuora’s Top 10 Promising Coins in 2021', "LG Energy Solution opens books for South Korea's largest IPO at up to $10.8 billion", 'Top Lessons of Crypto 2021', 'Global Markets in 2021: Recoveries, reflation and wrecking balls', "China's Tencent builds stake in UK digital bank Monzo", 'Pre-Market Movers: AMD rises, Peloton and Tesla Dip, Exxon and Pfizer on Watch', 'Should you use a trading bot? The good, the bad, and the ugly of algorithmic trading.', 'Alvexo - Trading CFDs with Candlesticks', 'Alvexo - Non-Farm Payroll Report (NFP): Live Trading Event', 'Alvexo - Getting Started with Technical Analysis']
def get_tilte_dictionary(company_list, number_page=1):
    """
    Map each company to a single lowercase string of all its news titles.

    Args:
        company_list: iterable of company slugs to query.
        number_page (int): how many news pages to aggregate per company.

    Returns:
        dict: {company slug: space-joined lowercase headlines}.
    """
    titles_by_company = {}
    for company in company_list:
        headlines = []
        for page in range(1, number_page + 1):
            headlines += get_news(company, page)
        titles_by_company[str(company)] = ' '.join(headlines).lower()
    return titles_by_company
Pour donner un exemple on effectue un wordcloud avec l’entreprise Boeing
# Build the headline dictionary for the first 100 companies (1 page each).
data = get_tilte_dictionary(liste_entreprise[:100],1)
print(data['boeing-co'])
u.s. stocks higher at close of trade; dow jones industrial average up 0.68% pre-market movers: tesla jumps, ford, boeing rise; ross stores dips jet maker safran plans 12,000 hires in 2022 as air traffic recovers israel signs deal to buy $3.1 billion in u.s. helicopters, tankers u.s. stocks mixed at close of trade; dow jones industrial average up 0.25% boeing 737 max to resume flights in indonesia – report s&p 500 ends lower after four-day rally to record high u.s. stocks mixed at close of trade; dow jones industrial average up 0.26% s&p 500 snaps record run after briefly topping 4,800 s&p 500 eases from record as tech stumble offsets rebound in travel stocks should you use a trading bot? the good, the bad, and the ugly of algorithmic trading. alvexo - trading cfds with candlesticks alvexo - non-farm payroll report (nfp): live trading event alvexo - getting started with technical analysis
# Word cloud of Boeing headlines, masked by an airplane silhouette image.
mask = np.array(Image.open(requests.get('https://www.freeiconspng.com/uploads/airplane-icon-image-gallery-1.png', stream=True).raw))
wordcloud = WordCloud(background_color = 'white', max_words = 50, mask = mask).generate(data['boeing-co'])
plt.imshow(wordcloud)
plt.axis("off")
plt.show();
# Distribution of word lengths in Boeing's headlines.
sns.histplot([len(i) for i in data['boeing-co'].split() ])
plt.title('histogram de la longueur des mots pour les news de boeing')
Text(0.5, 1.0, 'histogram de la longueur des mots pour les news de boeing')
L’idée de cette partie consiste à analyser les sentiments donnés par les 30 titres des dernières news et de récupérer les 20 entreprises dont les sentiments sont les plus positifs afin de réaliser un portefeuille à partir de ces 20 capitalisations.
def clean_text(text):
    """
    Normalize a raw headline string for sentiment analysis.

    Tokenizes, keeps alphabetic tokens only, removes English stop words
    (except a few direction/negation words that carry market sentiment),
    then applies Snowball stemming.

    Args:
        text (str): raw concatenated news titles.

    Returns:
        str: space-joined cleaned and stemmed tokens.
    """
    tokens = [t for t in nltk.word_tokenize(text) if t.isalpha()]
    # Words deliberately kept: they convey direction/negation in headlines.
    keep_words = ['up', 'down', 'under', 'no']
    stop_words = [w for w in set(stopwords.words('english')) if w not in keep_words]
    filtered = [t for t in tokens if t not in stop_words]
    stemmer = SnowballStemmer(language='english')
    return ' '.join(stemmer.stem(t) for t in filtered)
def return_sentiment(company_data):
    """
    Score the sentiment of a company's aggregated news titles.

    Args:
        company_data (str): concatenated news titles for one company.

    Returns:
        float: weighted VADER score — positive words weighted 1.5, neutral
        1.0 and negative -1.5. Weights were chosen arbitrarily, so the
        aggregation is debatable.
    """
    # Clean the text once — the original recomputed clean_text for the
    # polarity call, doing the tokenization/stemming work twice.
    text = clean_text(company_data)
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(text)
    return scores['pos'] * 1.5 + scores['neu'] - scores['neg'] * 1.5
def build_data_sentiment(company_list, page=1):
    """
    Build a sentiment-score DataFrame for a list of companies.

    Compiles the previously defined helpers: scrapes the news titles and
    scores each company's aggregated headlines.

    Args:
        company_list: iterable of company slugs.
        page (int): number of news pages to aggregate per company.

    Returns:
        pandas.DataFrame: columns ['company', 'score'].
    """
    data = get_tilte_dictionary(company_list, page)
    # Collect rows in a list first: DataFrame.append is deprecated (removed
    # in pandas 2.0) and quadratic when called inside a loop.
    rows = [{'company': str(company), 'score': return_sentiment(data[company])}
            for company in company_list]
    return pd.DataFrame(rows, columns=['company', 'score'])
df = build_data_sentiment(liste_entreprise,3) # probably a bit slow for the complete dataframe!
df = df.set_index('company')
# Keep the 20 companies with the most positive sentiment scores.
ordered_df = df['score'].sort_values(ascending = False)
kept_company = ordered_df[:20]
print(kept_company)
company signet-jewelers-limited 1.0240 cboe-holdings-inc 0.9690 hershey-co 0.9675 pepsico 0.9670 zions-bancorp 0.9665 nike 0.9650 motorola-inc 0.9630 zoetis-inc 0.9620 metlife-inc 0.9585 waste-managemnt 0.9575 kohls-corp 0.9550 eqt-corporation 0.9545 polo-ralph-laur 0.9540 kroger-co 0.9525 eastman-chem 0.9520 essex-property-trust-inc 0.9515 pnc-fin-serv 0.9510 eaton 0.9495 intuitive-surgical-inc 0.9495 te-connectivity 0.9465 Name: score, dtype: float64
L’objectif de cette partie est d’explorer les données financières du portefeuille constitué suite à l’analyse de sentiment au moyen de data visualisation et Yahoo Finance.
def get_tickers(company):
    """
    Resolve the stock ticker of a company from its Investing.com page.

    Args:
        company (str): company slug on Investing.com.

    Returns:
        str: the ticker symbol extracted from the page header.

    Raises:
        ValueError: when no "(TICKER)" pattern is found in the page header,
            instead of the opaque AttributeError the unchecked search gave.
    """
    url = 'https://www.investing.com/equities/' + company
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.content, 'html.parser')
    # NOTE(review): this CSS class looks auto-generated and may break when
    # the site redeploys — confirm the selector still matches.
    article_body = soup.findAll("h1", {"class": "text-2xl font-semibold instrument-header_title__GTWDv mobile:mb-2"})
    match = re.search(r'\((.*?)\)', str(article_body))
    if match is None:
        raise ValueError("no ticker found on page for " + company)
    ticker = re.search(r'[a-zA-Z]+', match.group(0)).group(0)
    return ticker
def get_ticker_list(kept_company):
    """
    Resolve the ticker of every company kept after the sentiment analysis.

    Args:
        kept_company (pandas.Series): sentiment scores indexed by company slug.

    Returns:
        list[str]: tickers, in the same order as the index.
    """
    return [get_tickers(name) for name in kept_company.index]
# Resolve the Yahoo-style ticker for each of the 20 kept companies.
tickers = get_ticker_list(kept_company)
def retrieve_ticker_name(tickers):
    """
    Look up the full company name behind each ticker via Yahoo Finance.

    Args:
        tickers: list of ticker symbols.

    Returns:
        list[str]: long names; falls back to the ticker itself when Yahoo
        has no name for it.
    """
    names = []
    for symbol in tickers:
        stock = yf.Ticker(symbol)
        # .get avoids a KeyError when 'longName' is missing from the info
        # dict; the `is None` check replaces the unidiomatic `== None`.
        long_name = stock.info.get('longName')
        names.append(symbol if long_name is None else long_name)
    return names
# Human-readable company names for plot titles and tables.
ticker_name = retrieve_ticker_name(tickers)
def load_data(tickers, start=datetime.datetime(2012, 5, 31), end=None):
    """
    Download historical market data for the given tickers from Yahoo Finance.

    Args:
        tickers: list of ticker symbols.
        start (datetime): start of the date range, defaults to 2012-05-31.
        end (datetime | None): end of the date range; None means "now",
            resolved at call time. (The original default
            `datetime.datetime.now()` was evaluated once at definition time
            and therefore frozen.)

    Returns:
        pandas.DataFrame: per-ticker OHLCV data, grouped by ticker.
    """
    # NOTE(review): `from datetime import datetime` earlier in the file
    # shadows the `datetime` module — confirm which binding is live when
    # this cell runs.
    if end is None:
        end = datetime.datetime.now()
    return yf.download(tickers, start, end, group_by="ticker")
# Download ~10 years of daily OHLCV data for the kept tickers.
data = load_data(tickers)
[*********************100%***********************] 20 of 20 completed
def keep_close(tickers, data, column="Adj Close"):
    """
    Extract one price column (adjusted close by default) per ticker.

    Args:
        tickers: list of ticker symbols (keys into `data`).
        data: mapping or grouped DataFrame where data[ticker] is that
            ticker's OHLCV DataFrame.
        column (str): which price column to keep; defaults to "Adj Close"
            (parameterized so the same helper can extract Open/High/... too).

    Returns:
        pandas.DataFrame: one column per ticker; pandas aligns each
        assigned Series on the frame's index.
    """
    df = pd.DataFrame()
    for ticker in tickers:
        df[ticker] = data[ticker][column]
    return df
# Adjusted-close prices, one column per ticker.
data_close = keep_close(tickers, data)
# Candlestick chart for the first kept ticker.
df = data[tickers[0]]
fig = go.Figure(data=[go.Candlestick(x=df.index,
                                     open=df.Open,
                                     high=df.High,
                                     low=df.Low,
                                     close=df.Close)])
fig.update_layout(
    title=ticker_name[0]+' full stock price',
    yaxis_title="Price in $ USD")
fig.show()
def tendance(prix, window=5):
    """
    Compute a trailing moving average of a closing-price series.

    The first `window` points are returned unchanged; each later point is
    the mean of the previous `window` prices.

    Args:
        prix (pandas.Series): closing prices.
        window (int): averaging window length, defaults to 5.

    Returns:
        pandas.Series: smoothed series, same index as `prix`.
    """
    values = list(prix[:window])
    for i in range(window, len(prix)):
        # Bug fix: the original hard-coded a 5-wide slice (prix[i-5:i])
        # regardless of the `window` argument.
        values.append(np.mean(prix[i - window:i]))
    return pd.Series(values, index=prix.index)
# Plot every kept stock with its 10-day moving average on a 10x2 grid.
fig = plt.figure(figsize=(22, 50))
rows = 10
columns = 2
grid = plt.GridSpec(rows, columns, wspace=.25, hspace=.25)
for i in range(rows * columns):
    # plt.subplot accepts the GridSpec cell directly — no need for the
    # original exec(f"plt.subplot(grid{[i]})") string-building hack.
    plt.subplot(grid[i])
    plt.plot(data_close[tickers[i]])
    trend = tendance(data_close[tickers[i]], 10)
    plt.plot(trend)
    plt.title(ticker_name[i] + " stock price")
    plt.ylabel("Price in $ USD")
# Daily log-returns: log(P_t) - log(P_{t-1}); first row is NaN and dropped.
data_log_close = data_close.apply(np.log, axis=1)
data_return = data_log_close - data_log_close.shift(1)
data_return = data_return.iloc[1: , :]
# Correlation matrix of the daily log-returns, rendered as a heat map.
corr = data_return.corr()
corr.style.background_gradient(cmap='coolwarm')
| SIG | CBOE | HSY | PEP | ZION | NKE | MSI | ZTS | MET | WM | KSS | EQT | RL | KR | EMN | ESS | PNC | ETN | ISRG | TEL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SIG | 1.000000 | 0.270812 | 0.225659 | 0.211784 | 0.407821 | 0.319153 | 0.292325 | 0.218256 | 0.423386 | 0.297961 | 0.473980 | 0.248494 | 0.388806 | 0.126152 | 0.401957 | 0.300928 | 0.447467 | 0.414018 | 0.220233 | 0.434039 |
| CBOE | 0.270812 | 1.000000 | 0.354646 | 0.368007 | 0.313979 | 0.295410 | 0.315253 | 0.304580 | 0.351844 | 0.387143 | 0.214613 | 0.092149 | 0.214337 | 0.111191 | 0.268787 | 0.353212 | 0.404260 | 0.316990 | 0.250908 | 0.351376 |
| HSY | 0.225659 | 0.354646 | 1.000000 | 0.588932 | 0.199968 | 0.308428 | 0.339209 | 0.339139 | 0.304143 | 0.489934 | 0.171386 | 0.145277 | 0.183984 | 0.197973 | 0.296447 | 0.428129 | 0.344562 | 0.351688 | 0.283110 | 0.359130 |
| PEP | 0.211784 | 0.368007 | 0.588932 | 1.000000 | 0.266467 | 0.391932 | 0.422739 | 0.427964 | 0.422891 | 0.551922 | 0.205455 | 0.156539 | 0.272368 | 0.242764 | 0.344366 | 0.483003 | 0.404788 | 0.433359 | 0.340990 | 0.407563 |
| ZION | 0.407821 | 0.313979 | 0.199968 | 0.266467 | 1.000000 | 0.380137 | 0.391086 | 0.267225 | 0.749921 | 0.378570 | 0.463906 | 0.295628 | 0.492790 | 0.136785 | 0.598087 | 0.347171 | 0.811036 | 0.574015 | 0.286692 | 0.542099 |
| NKE | 0.319153 | 0.295410 | 0.308428 | 0.391932 | 0.380137 | 1.000000 | 0.404348 | 0.416604 | 0.459184 | 0.416528 | 0.346025 | 0.180701 | 0.442991 | 0.162245 | 0.411268 | 0.358537 | 0.451005 | 0.469428 | 0.373056 | 0.482661 |
| MSI | 0.292325 | 0.315253 | 0.339209 | 0.422739 | 0.391086 | 0.404348 | 1.000000 | 0.445087 | 0.463856 | 0.487439 | 0.299344 | 0.167165 | 0.350396 | 0.183419 | 0.438616 | 0.394136 | 0.471237 | 0.486739 | 0.398183 | 0.463051 |
| ZTS | 0.218256 | 0.304580 | 0.339139 | 0.427964 | 0.267225 | 0.416604 | 0.445087 | 1.000000 | 0.400741 | 0.438956 | 0.207965 | 0.182463 | 0.253688 | 0.132003 | 0.354082 | 0.356839 | 0.391727 | 0.443606 | 0.459403 | 0.459622 |
| MET | 0.423386 | 0.351844 | 0.304143 | 0.422891 | 0.749921 | 0.459184 | 0.463856 | 0.400741 | 1.000000 | 0.470181 | 0.464359 | 0.302773 | 0.517925 | 0.180782 | 0.658183 | 0.403501 | 0.787904 | 0.663655 | 0.390822 | 0.625816 |
| WM | 0.297961 | 0.387143 | 0.489934 | 0.551922 | 0.378570 | 0.416528 | 0.487439 | 0.438956 | 0.470181 | 1.000000 | 0.286761 | 0.175268 | 0.333255 | 0.186640 | 0.417870 | 0.496862 | 0.508004 | 0.509865 | 0.346698 | 0.469348 |
| KSS | 0.473980 | 0.214613 | 0.171386 | 0.205455 | 0.463906 | 0.346025 | 0.299344 | 0.207965 | 0.464359 | 0.286761 | 1.000000 | 0.261971 | 0.535752 | 0.137997 | 0.421913 | 0.334397 | 0.470914 | 0.415248 | 0.225536 | 0.382105 |
| EQT | 0.248494 | 0.092149 | 0.145277 | 0.156539 | 0.295628 | 0.180701 | 0.167165 | 0.182463 | 0.302773 | 0.175268 | 0.261971 | 1.000000 | 0.209747 | 0.119089 | 0.326734 | 0.121972 | 0.286879 | 0.308889 | 0.136463 | 0.264051 |
| RL | 0.388806 | 0.214337 | 0.183984 | 0.272368 | 0.492790 | 0.442991 | 0.350396 | 0.253688 | 0.517925 | 0.333255 | 0.535752 | 0.209747 | 1.000000 | 0.114226 | 0.470300 | 0.312893 | 0.487110 | 0.448183 | 0.261942 | 0.455957 |
| KR | 0.126152 | 0.111191 | 0.197973 | 0.242764 | 0.136785 | 0.162245 | 0.183419 | 0.132003 | 0.180782 | 0.186640 | 0.137997 | 0.119089 | 0.114226 | 1.000000 | 0.134470 | 0.114040 | 0.168024 | 0.152420 | 0.101728 | 0.140472 |
| EMN | 0.401957 | 0.268787 | 0.296447 | 0.344366 | 0.598087 | 0.411268 | 0.438616 | 0.354082 | 0.658183 | 0.417870 | 0.421913 | 0.326734 | 0.470300 | 0.134470 | 1.000000 | 0.365072 | 0.626810 | 0.669126 | 0.355060 | 0.626401 |
| ESS | 0.300928 | 0.353212 | 0.428129 | 0.483003 | 0.347171 | 0.358537 | 0.394136 | 0.356839 | 0.403501 | 0.496862 | 0.334397 | 0.121972 | 0.312893 | 0.114040 | 0.365072 | 1.000000 | 0.447726 | 0.408633 | 0.331821 | 0.422747 |
| PNC | 0.447467 | 0.404260 | 0.344562 | 0.404788 | 0.811036 | 0.451005 | 0.471237 | 0.391727 | 0.787904 | 0.508004 | 0.470914 | 0.286879 | 0.487110 | 0.168024 | 0.626810 | 0.447726 | 1.000000 | 0.650546 | 0.373748 | 0.620789 |
| ETN | 0.414018 | 0.316990 | 0.351688 | 0.433359 | 0.574015 | 0.469428 | 0.486739 | 0.443606 | 0.663655 | 0.509865 | 0.415248 | 0.308889 | 0.448183 | 0.152420 | 0.669126 | 0.408633 | 0.650546 | 1.000000 | 0.420509 | 0.658337 |
| ISRG | 0.220233 | 0.250908 | 0.283110 | 0.340990 | 0.286692 | 0.373056 | 0.398183 | 0.459403 | 0.390822 | 0.346698 | 0.225536 | 0.136463 | 0.261942 | 0.101728 | 0.355060 | 0.331821 | 0.373748 | 0.420509 | 1.000000 | 0.442776 |
| TEL | 0.434039 | 0.351376 | 0.359130 | 0.407563 | 0.542099 | 0.482661 | 0.463051 | 0.459622 | 0.625816 | 0.469348 | 0.382105 | 0.264051 | 0.455957 | 0.140472 | 0.626401 | 0.422747 | 0.620789 | 0.658337 | 0.442776 | 1.000000 |
L'idée de cette deuxième partie est de modéliser la construction d'un portefeuille à partir des prix de clôtures et des capitalisations boursières des entreprises considérées.
Dans un premier temps, l'objectif est de construire les différentes métriques requises pour appliquer un modèle classique de construction de portefeuille, le CAPM : l'objectif ici est de calculer les rendements de chaque actif considéré ainsi que leur volatilité mesurée par l'écart-type. Il est également nécessaire de calculer les covariances entre les différents actifs.
Dans un second temps, on arrive à calculer les poids de chaque entreprise dans le S&P500 grâce à une division du MarketCap de chaque entreprise sur le MarketCap total de chaque entreprise.
Enfin, on applique le CAPM et la théorie moderne du portefeuille de Markowitz en implémentant le ratio de Sharpe, l'idée étant de trouver le portefeuille de variance minimale situé à l'extrême gauche de la frontière efficiente.
Pour cela, on a besoin d'une fonction optimizer permettant de déterminer les actifs dont le couple rendement/écart-type permet de déterminer ce portefeuille de variance minimale.
def calcul_meanvar(names, prices, caps):
    """
    Compute portfolio inputs: market-cap weights, annualized mean returns
    and the annualized covariance matrix of daily returns.

    Args:
        names: asset names (returned unchanged).
        prices (pandas.DataFrame): closing prices, one column per asset
            (returns are recomputed here, not taken as input).
        caps: market capitalizations, same order as the price columns.

    Returns:
        tuple: (names, weights, returns_exp, covars) where
            weights — caps normalized to sum to 1,
            returns_exp — per-asset mean daily return, annualized as
                (1 + r)**250 - 1 (compound interest over ~250 trading days),
            covars — covariance matrix of daily returns, scaled by 250.
    """
    # Work on a copy: the original called dropna(inplace=True) and silently
    # mutated the caller's DataFrame.
    clean = prices.dropna(axis=0)
    price_mat = np.asarray(clean, dtype=float).T  # rows = assets, cols = dates
    weights = np.array(caps) / sum(caps)
    # Vectorized simple returns p[t+1]/p[t] - 1 (replaces the double loop).
    returns = price_mat[:, 1:] / price_mat[:, :-1] - 1
    # Mean daily return per asset.
    returns_exp = returns.mean(axis=1)
    covars = np.cov(returns) * 250              # annualized covariances
    returns_exp = (1 + returns_exp) ** 250 - 1  # annualized compound returns
    return names, weights, returns_exp, covars
# rf (ou r) taux sans risque (fixé arbitrairement)
# C matrice variance covariance = covars
# W poids des actifs = weigth
# R rendements des actifs = returns_exp
from pandas_datareader import data
def temporary_market_cap(tickers):
    """Fetch the current market cap of each ticker via Yahoo quotes."""
    quotes = data.get_quote_yahoo(tickers)
    return list(quotes['marketCap'])
def mean_ptf(W, R):
    """Expected portfolio return: weight-weighted sum of asset returns."""
    return (R * W).sum()
def var_ptf(W, C):
    """Portfolio variance: the quadratic form W^T C W."""
    return W @ C @ W
def fit_model(W, R, C, r=0.2):
    """
    Objective for the frontier search: portfolio variance plus a large
    penalty proportional to the distance from the target return r.
    """
    expected = mean_ptf(W, R)
    variance = var_ptf(W, C)
    return variance + 50 * abs(expected - r)
def mean_ptf_var(W, R, C):
    """Return the (expected return, variance) pair for the weights W."""
    expected = mean_ptf(W, R)
    risk = var_ptf(W, C)
    return expected, risk
# Build the Markowitz efficient frontier (the set of efficient portfolios).
def build_frontier(R, C, rf=0.2):
    """
    Trace the efficient frontier: for 20 target returns spanning
    [min(R), max(R)], find the long-only weights minimizing the penalized
    variance objective (fit_model).

    Args:
        R: expected asset returns.
        C: covariance matrix of asset returns.
        rf (float): risk-free rate; 0.2 default kept for compatibility
            (NOTE(review): `rf` is not actually used in the loop body).

    Returns:
        tuple: (frontier means, frontier variances, list of weight vectors).

    Raises:
        RuntimeError: when the SLSQP optimizer fails to converge.
    """
    frontier_mean, frontier_var, frontier_weights = [], [], []
    n = len(R)
    for r in linspace(min(R), max(R), num=20):
        W = ones([n]) / n  # start from an equal-weight portfolio
        b_ = [(0, 1) for i in range(n)]  # long-only weights
        c_ = ({'type': 'eq', 'fun': lambda W: sum(W) - 1.})  # weights sum to 100%
        optimized = scipy.optimize.minimize(fit_model, W, (R, C, r), method='SLSQP', constraints=c_, bounds=b_)
        if not optimized.success:
            # RuntimeError instead of BaseException: BaseException is the
            # root of interpreter-exit exceptions and escapes
            # `except Exception` handlers.
            raise RuntimeError("bug opti")
        frontier_mean.append(r)
        frontier_var.append(var_ptf(optimized.x, C))
        frontier_weights.append(optimized.x)
    return array(frontier_mean), array(frontier_var), frontier_weights
# Compute the optimal portfolio in the Markowitz sense (max Sharpe ratio).
def build_weights(R, C, rf):
    """
    Find the long-only weights maximizing the Sharpe ratio (mean - rf)/std.

    Args:
        R: expected asset returns.
        C: covariance matrix of asset returns.
        rf (float): risk-free rate.

    Returns:
        numpy.ndarray: optimal weights summing to 1.

    Raises:
        RuntimeError: when the SLSQP optimizer fails to converge.
    """
    def fit_model(W, R, C, rf):
        # Minimizing the inverse of the Sharpe ratio maximizes the ratio.
        mean, var = mean_ptf_var(W, R, C)
        util = (mean - rf) / sqrt(var)
        return 1 / util
    n = len(R)
    W = ones([n]) / n  # equal-weight starting point
    b_ = [(0., 1.) for i in range(n)]  # long-only bounds
    c_ = ({'type': 'eq', 'fun': lambda W: sum(W) - 1.})  # fully invested
    optimized = scipy.optimize.minimize(fit_model, W, (R, C, rf), method='SLSQP', constraints=c_, bounds=b_)
    if not optimized.success:
        # RuntimeError is more precise than the bare Exception the original
        # raised, and is still caught by `except Exception`.
        raise RuntimeError("bug opti")
    return optimized.x
from matplotlib.pyplot import figure
def ptf_optim(title, names, R, C, rf, color="red"):
    """
    Optimize the portfolio, print the allocation table and plot the
    efficient frontier with each asset's risk/return point.

    Args:
        title (str): heading printed above the allocation table.
        names: asset display names.
        R: expected asset returns.
        C: covariance matrix (diagonal -> variances, so C[i,i]**.5 is std).
        rf (float): risk-free rate passed to the optimizers.
        color (str): color for the asset scatter and the frontier line.
    """
    # Portfolio optimization (max Sharpe weights).
    W = build_weights(R, C, rf)
    mean, var = mean_ptf_var(W, R, C)
    f_mean, f_var, f_weights = build_frontier(R, C, rf)
    # Print the optimal allocation, then plot.
    print(title)
    print("\n")
    print_assets(names, W, R, C)
    n = len(names)
    # figure/scatter/text/plot/xlabel come from the `pylab` star import.
    figure(figsize=(18, 10), dpi=80)
    scatter([C[i,i]**.5 for i in range(n)], R, marker='o',color=color)
    for i in range(n):
        text(C[i,i]**.5, R[i], ' %s'%names[i], verticalalignment='center', color="red")
    # Mark the optimized portfolio's (std, return) point in red.
    scatter(var**.5, mean, marker='o', color="red")
    plot(f_var**.5, f_mean, color=color)
    # NOTE(review): '$\sigma$' triggers an invalid-escape warning; a raw
    # string r'$\sigma$' would be preferable.
    xlabel('$\sigma$'), ylabel('$r$')
    plt.title("Frontière efficiente et allocation optimale")
    plt.grid(True)
#### APPLY ALL OF THE PRECEDING FUNCTIONS TO A CASE STUDY
names, prices, caps = ticker_name, data_close, temporary_market_cap(tickers)
n = len(names) # load the inputs
# Compute expected returns and covariances for our assets.
names, W, R, C = calcul_meanvar(names, prices, caps)
rf = .025 # risk-free asset return
def print_assets(names, W, R, C):
    """
    Print a markdown table of each asset's weight, annualized return and
    standard deviation (all expressed in percent).

    Args:
        names: asset display names (become the table index).
        W: portfolio weights.
        R: annualized expected returns.
        C: annualized covariance matrix (diagonal holds the variances).
    """
    # Build all rows first: DataFrame.append is deprecated (removed in
    # pandas 2.0) and quadratic when called inside a loop.
    rows = [
        {
            "Nom": names[i],
            "Poids": round(100 * W[i], 3),
            "Rendements": round(100 * R[i], 3),
            "Std": round(100 * C[i, i] ** .5, 3),
        }
        for i in range(len(names))
    ]
    df = pd.DataFrame(rows, columns=["Nom", "Poids", "Rendements", "Std"])
    df.index = df.Nom
    df = df.drop('Nom', axis=1)
    print(df.to_markdown())  # to_markdown requires the `tabulate` package
print("Allocation en utilisant les capitalisations (approche historique)")
print_assets(names, W, R, C)
# Historical return and variance of the market-cap-weighted portfolio.
mean, var = mean_ptf_var(W, R, C)
# Markowitz optimization (maximize the Sharpe ratio) on historical prices.
print("\n")
ptf_optim("Optimisation au sens de Markowitz (approche du modèle)", names, R, C, rf, color='black')
show()
Allocation en utilisant les capitalisations (approche historique) | Nom | Poids | Rendements | Std | |:-------------------------------------------|--------:|-------------:|-------:| | Signet Jewelers Limited | 0.383 | 27.5 | 60.374 | | Cboe Global Markets, Inc. | 1.072 | 20.64 | 24.481 | | The Hershey Company | 3.114 | 15.357 | 21.007 | | PepsiCo, Inc. | 18.712 | 15.249 | 18.076 | | Zions Bancorporation, National Association | 0.786 | 19.506 | 32.331 | | NIKE, Inc. | 20.392 | 27.981 | 26.102 | | Motorola Solutions, Inc. | 3.534 | 24.124 | 23.989 | | Zoetis Inc. | 8.663 | 29.941 | 24.546 | | MetLife, Inc. | 4.161 | 16.03 | 30.514 | | Waste Management, Inc. | 5.326 | 23.125 | 18.047 | | Kohl's Corporation | 0.54 | 17.029 | 47.522 | | EQT Corporation | 0.645 | 6.757 | 46.584 | | Ralph Lauren Corporation | 0.696 | 5.168 | 36.014 | | The Kroger Co. | 2.599 | 20.517 | 27.505 | | Eastman Chemical Company | 1.258 | 13.011 | 29.04 | | Essex Property Trust, Inc. | 1.802 | 16.315 | 24.125 | | The PNC Financial Services Group, Inc. | 6.885 | 21.634 | 27.291 | | Eaton Corporation plc | 5.26 | 20.236 | 27.165 | | Intuitive Surgical, Inc. | 10.062 | 26.873 | 30.758 | | TE Connectivity Ltd. | 4.11 | 23.265 | 25.532 | Optimisation au sens de Markowitz (approche du modèle) | Nom | Poids | Rendements | Std | |:-------------------------------------------|--------:|-------------:|-------:| | Signet Jewelers Limited | 0 | 27.5 | 60.374 | | Cboe Global Markets, Inc. | 7.557 | 20.64 | 24.481 | | The Hershey Company | 0 | 15.357 | 21.007 | | PepsiCo, Inc. | 0 | 15.249 | 18.076 | | Zions Bancorporation, National Association | 0 | 19.506 | 32.331 | | NIKE, Inc. | 13.428 | 27.981 | 26.102 | | Motorola Solutions, Inc. | 3.782 | 24.124 | 23.989 | | Zoetis Inc. | 24.433 | 29.941 | 24.546 | | MetLife, Inc. | 0 | 16.03 | 30.514 | | Waste Management, Inc. 
| 32.146 | 23.125 | 18.047 | | Kohl's Corporation | 0 | 17.029 | 47.522 | | EQT Corporation | 0 | 6.757 | 46.584 | | Ralph Lauren Corporation | 0 | 5.168 | 36.014 | | The Kroger Co. | 14.794 | 20.517 | 27.505 | | Eastman Chemical Company | 0 | 13.011 | 29.04 | | Essex Property Trust, Inc. | 0 | 16.315 | 24.125 | | The PNC Financial Services Group, Inc. | 0 | 21.634 | 27.291 | | Eaton Corporation plc | 0 | 20.236 | 27.165 | | Intuitive Surgical, Inc. | 3.86 | 26.873 | 30.758 | | TE Connectivity Ltd. | 0 | 23.265 | 25.532 |
L'idée de cette troisième partie est d'arriver à trouver et à construire des index qui ne sont pas disponibles gratuitement. On a choisi de reproduire les poids des industries dans le S&P 500 à partir de la taxonomie des GICS (créée par Standard & Poor's).
def geteGics(tickers):
    """Scrape the GICS industry label of each ticker from Fidelity's snapshot page.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to look up, one HTTP request per symbol.

    Returns
    -------
    list of str
        One industry label per ticker, with '.' replaced by '/' so the
        labels can be matched against the Wikipedia GICS table later on.
    """
    labels = []
    for symbol in tickers:
        # Rotate the ticker through the query string, one page per symbol.
        target = 'https://eresearch.fidelity.com/eresearch/evaluate/snapshot.jhtml?symbols=' + symbol
        response = requests.get(target, verify=False)
        # Navigate to the exact spot of the page holding the GICS label:
        # second 'sub-heading' div inside the company-profile container.
        parsed = BeautifulSoup(response.content, 'html.parser')
        profile = parsed.find('div', {'id': 'companyProfile'})
        heading = profile.find_all('div', {'class': 'sub-heading'})[1]
        label = heading.text.replace('\n', '').replace('Industry (GICS®)', '')
        labels.append(label.replace('.', '/'))
    return labels
# Scrape the GICS industry label for every ticker (one HTTP request each).
Industry_names=geteGics(tickers)
Industry_names
# Market-cap weights, normalised so they sum to 1.
weights = temporary_market_cap(tickers)
weights = array(weights) / sum(weights)
# Build a DataFrame tying ticker, weight and scraped industry together.
d = {'Ticker': tickers,'Weight': weights, 'Industry':Industry_names}
df = pd.DataFrame(d)
# The two sites use different text formats — notably "and" vs "&", plus
# upper/lower casing — so normalise everything the same way.
# NOTE(review): str.replace('and', '&') also rewrites 'and' inside words
# (e.g. "Standard" -> "St&ard"); the same normalisation is applied to the
# Wikipedia side, so the two stay mutually consistent.
df['Industry']=df['Industry'].str.replace('and', '&').str.title()
df.reset_index(drop=True, inplace = True)
df.head(10)
| Ticker | Weight | Industry | |
|---|---|---|---|
| 0 | SIG | 0.003830 | Specialty Retail |
| 1 | CBOE | 0.010718 | Capital Markets |
| 2 | HSY | 0.031140 | Food Products |
| 3 | PEP | 0.187118 | Beverages |
| 4 | ZION | 0.007864 | Banks |
| 5 | NKE | 0.203917 | Textiles, Apparel & Luxury Goods |
| 6 | MSI | 0.035338 | Communications Equipment |
| 7 | ZTS | 0.086628 | Pharmaceuticals |
| 8 | MET | 0.041611 | Insurance |
| 9 | WM | 0.053261 | Commercial Services & Supplies |
def extract_content(url):
    """Download *url* and return its prettified HTML markup as a string.

    NOTE(review): verify=False disables TLS certificate validation for
    this request (kept as-is to preserve the original behaviour).
    """
    response = requests.get(url, verify=False)
    markup = BeautifulSoup(response.content, 'html.parser')
    return markup.prettify()
def parse_content():
    """Scrape the GICS taxonomy table from Wikipedia into a DataFrame.

    Returns a DataFrame with two columns:
      - 'code': GICS numeric code as a string (2/4/6/8 digits depending
        on the level: sector / group / industry / sub-industry)
      - 'intitule': the level's name, with 'and' -> '&' and Title Case
        applied so it matches the formatting of the Fidelity labels.
    """
    content = extract_content('https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard')
    # Keep only the chunk after the first 'wikitable' marker (the GICS table).
    content = content.split('wikitable')[1]
    # Remove pre-existing slashes and newlines, then tag each table-cell
    # start with '/' so cells can later be recovered with split('/').
    content = content.replace('/','')
    content = content.replace('\n','')
    content = content.replace('amp;','')
    content = content.replace('<td>','<td>/')
    lines = content.split('<tr>')
    tout = ''
    # Skip the two header rows; strip every remaining HTML tag and
    # accumulate the bare cell text into one string.
    for i in lines[2:] :
        text = i.split('<td>')
        for j in text :
            j = re.sub('<[^>]+>', '', j)
            tout += j
    # Collapse repeated spaces, normalise the cell separators, and drop
    # everything from the trailing 'Revision' section onwards.
    tout=re.sub(' +',' ',tout)
    tout = tout.replace('/ /','/')
    tout = tout.split('Revision')[0]
    tout = tout.strip()
    tout = tout.split('/')
    # Cells alternate code / name, so pair them up two by two.
    liste_code = []
    liste_int = []
    for i in range(0, len(tout)-1,2):
        liste_code.append(tout[i].strip())
        liste_int.append(tout[i+1].strip())
    manpage = pd.DataFrame()
    manpage['code'] = liste_code
    manpage['intitule'] = liste_int
    # Same normalisation as applied to df['Industry'], so both sides of
    # the later join use identical formatting.
    manpage['intitule'] = manpage['intitule'].str.replace('and','&').str.title()
    return manpage
# Build the GICS code/label lookup table once; reused by get_ind below.
mainpage = parse_content()
mainpage.head()
| code | intitule | |
|---|---|---|
| 0 | 10 | Energy |
| 1 | 1010 | Energy |
| 2 | 101010 | Energy Equipment & Services |
| 3 | 10101010 | Oil & Gas Drilling |
| 4 | 10101020 | Oil & Gas Equipment & Services |
def get_ind(sub_ind):
    """Resolve a GICS label to its (industry, industry_group, sector) names.

    Looks *sub_ind* up in the global ``mainpage`` table and walks up the
    hierarchy via code prefixes (6 / 4 / 2 digits).

    Parameters
    ----------
    sub_ind : str
        A GICS level name, formatted like ``mainpage['intitule']``.

    Returns
    -------
    tuple of (str, str, str)
        (industry, industry_group, sector) names; three empty strings
        when the label is not found in the table.
    """
    possible = mainpage[mainpage['intitule'] == sub_ind]['code'].tolist()
    ind = ''
    ind_gr = ''
    sector = ''
    if len(possible) > 0:
        # Bug fix: the original `possible[len(possible)==8]` indexed the
        # list with a boolean (0 or 1) instead of selecting the code of
        # the most specific level. Pick the longest matching code, which
        # is the deepest level carrying this name.
        code = max(possible, key=len)
        ind = mainpage[mainpage['code'] == code[:6]]['intitule'].tolist()[0]
        ind_gr = mainpage[mainpage['code'] == code[:4]]['intitule'].tolist()[0]
        sector = mainpage[mainpage['code'] == code[:2]]['intitule'].tolist()[0]
    return (ind, ind_gr, sector)
# Create the new hierarchy columns in the initial DataFrame.
# Bug fix: the original overwrote df['Industry'] first and then fed the
# already-overwritten column back into get_ind to derive the group and
# sector — correct only by coincidence of the taxonomy's naming, and it
# ran the lookup three times per row. Compute all three levels from the
# scraped label in a single pass instead.
levels = df['Industry'].map(get_ind)
df['Industry'] = levels.map(lambda t: t[0])
df['Industry_Group'] = levels.map(lambda t: t[1])
df['Sector'] = levels.map(lambda t: t[2])
# Keep only the columns needed for the per-group aggregation below.
df = df[['Ticker','Weight','Industry_Group']]
df['Weight'] = df['Weight'].astype(str).astype('float64')
df.head()
| Ticker | Weight | Industry_Group | |
|---|---|---|---|
| 0 | SIG | 0.003830 | Retailing |
| 1 | CBOE | 0.010718 | Diversified Financials |
| 2 | HSY | 0.031140 | Food, Beverage & Tobacco |
| 3 | PEP | 0.187118 | Food, Beverage & Tobacco |
| 4 | ZION | 0.007864 | Banks |
# Aggregate the portfolio weights per GICS industry group.
byindus = df.groupby(['Industry_Group']).sum()
byindus
| Weight | |
|---|---|
| Industry_Group | |
| Banks | 0.076713 |
| Capital Goods | 0.052597 |
| Commercial & Professional Services | 0.053261 |
| Consumer Durables & Apparel | 0.210882 |
| Diversified Financials | 0.010718 |
| Energy | 0.006452 |
| Food & Staples Retailing | 0.025995 |
| Food, Beverage & Tobacco | 0.218258 |
| Health Care Equipment & Services | 0.100616 |
| Insurance | 0.041611 |
| Materials | 0.012583 |
| Pharmaceuticals, Biotechnology & Life Sciences | 0.086628 |
| Real Estate | 0.018019 |
| Retailing | 0.009228 |
| Technology Hardware & Equipment | 0.076440 |
# Pie chart of portfolio weights aggregated by GICS industry group.
# Fix: corrected the "weigths" typo in the user-facing chart title.
fig = px.pie(byindus, values='Weight', names=byindus.index, title='Percentage of weights by Industry group')
fig.show()
##### on a trié les poids des actifs dans l'ordre décroissant et on n'a gardé que les 10 premières valeurs
# Keep the ten largest positions by weight for the per-company chart.
df_top_10=df.sort_values(by=['Weight'], ascending=False).iloc[:10,]
# Fix: corrected the "weigths" typo in the user-facing chart title.
fig2 = px.pie(df_top_10, values='Weight', names='Ticker', title='Percentage of weights by Enterprise')
fig2.show()
df_top_10
| Ticker | Weight | Industry_Group | |
|---|---|---|---|
| 5 | NKE | 0.203917 | Consumer Durables & Apparel |
| 3 | PEP | 0.187118 | Food, Beverage & Tobacco |
| 18 | ISRG | 0.100616 | Health Care Equipment & Services |
| 7 | ZTS | 0.086628 | Pharmaceuticals, Biotechnology & Life Sciences |
| 16 | PNC | 0.068850 | Banks |
| 9 | WM | 0.053261 | Commercial & Professional Services |
| 17 | ETN | 0.052597 | Capital Goods |
| 8 | MET | 0.041611 | Insurance |
| 19 | TEL | 0.041103 | Technology Hardware & Equipment |
| 6 | MSI | 0.035338 | Technology Hardware & Equipment |
# Sanity check: the industry-group weights must sum to 1 (up to float noise).
verification = byindus['Weight'].sum()
# Fix: the original only tested a lower bound (> 0.9999), so a sum of,
# say, 1.2 would wrongly pass. Check closeness to 1 from both sides.
if abs(verification - 1.0) <= 1e-4:
    print("C'est bon")
else:
    print("C'est pas bon")
# it must be 1 (or almost)
C'est bon